Imports¶

In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,precision_recall_curve,roc_curve,auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import label_binarize

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as pyo
pyo.init_notebook_mode()

# Local imports
from ipynb.fs.defs.task3_1 import DatasetManager

Code¶

Helper Functions¶

In [2]:
# Plotting helpers
def plot_data(*traces,x=None,y=None,z=None,title="",x_label="",y_label="",name="",mode="markers",text=""):
    """
    Build a plotly figure from optional raw x/y(/z) data plus any number of
    pre-built traces.

    The x/y(/z) keyword arguments describe a single primary scatter trace
    (2D go.Scatter when z is None, go.Scatter3d otherwise), which is only
    added to the figure when both x and y are supplied. Every trace passed
    positionally in *traces is appended afterwards.
    """
    fig = go.Figure(layout={
        "title": title,
        "xaxis": {"title": x_label},
        "yaxis": {"title": y_label}
    })

    # Keyword arguments shared by the 2D and 3D trace constructors.
    common_kwargs = dict(x=x, y=y, mode=mode, name=name, text=text)
    if z is None:
        primary = go.Scatter(**common_kwargs)
    else:
        primary = go.Scatter3d(z=z, **common_kwargs)

    # Only attach the primary trace when actual data was provided.
    if x is not None and y is not None:
        fig.add_trace(primary)

    for extra in traces:
        fig.add_trace(extra)

    return fig

def plot_bar_data(*bars, x=None, title="", x_label="", y_label=""):
    """
    Build a grouped bar chart.

    Each element of *bars is a (name, values) pair; the values are also
    rendered as text labels formatted to four decimal places.
    """
    bar_traces = []
    for bar in bars:
        bar_traces.append(
            go.Bar(name=f"{bar[0]}", x=x, y=bar[1], text=bar[1], texttemplate="%{y:.4f}")
        )

    layout = {
        "title": title,
        "xaxis": {"title": x_label},
        "yaxis": {"title": y_label},
        "barmode": "group"
    }
    return go.Figure(layout=layout, data=bar_traces)

def create_trace(x=None, y=None, z=None, name="", mode="lines", text="", marker_size=None):
    """
    Create a standalone scatter trace (2D go.Scatter when z is None,
    go.Scatter3d otherwise) for later composition into a figure.
    """
    # Arguments common to both the 2D and 3D constructors.
    trace_kwargs = dict(
        x=x,
        y=y,
        mode=mode,
        name=name,
        text=text,
        marker=dict(size=marker_size)
    )

    if z is not None:
        return go.Scatter3d(z=z, **trace_kwargs)
    return go.Scatter(**trace_kwargs)

def plot_collection(plots, rows, cols, subplot_titles, specs, title="", height=1000, width=1000):
    """
    Combine several standalone figures into one subplot grid.

    `plots` maps (row, col, x_label, y_label) keys to figures; every trace
    of each figure is copied into the corresponding grid cell, and the
    cell's axes are labelled from the key.
    """
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=subplot_titles,
        specs=specs
    )

    fig.update_layout({
        "title": title,
        "height": height,
        "width": width,
    })

    # Copy every trace of every source figure into its grid cell.
    for key, source in plots.items():
        for trace in source.data:
            fig.add_trace(trace, row=key[0], col=key[1])

    # Label the axes of each populated cell.
    for key in plots:
        fig.update_xaxes(title_text=key[2], row=key[0], col=key[1])
        fig.update_yaxes(title_text=key[3], row=key[0], col=key[1])

    return fig

Model Manager Class¶

In [3]:
class ModelManager:
    """
    Manages the SVM modelling workflow for one dataset: train/test
    splitting, grid-searched SVC/SVR training with cross-validation, and
    visualisation of the resulting performance as plotly figures.
    """

    def __init__(self, feature_set, targets):
        """
        :param feature_set: 2D array-like of input features.
        :param targets: 1D array-like of targets (class labels for
                        classification, continuous values for regression).
        """
        self._features = feature_set
        self._targets = targets
        self._train_and_test_sets = None   # dict of X/y splits; set by split_dataset()
        self._trained_model = None         # fitted GridSearchCV; set by train_model()
        self._best_hps = None              # best hyperparameters found by the grid search
        self._train_preds = None           # predictions on the training set
        self._train_preds_prob = None      # class probabilities on the training set (clf only)
        self._test_preds = None            # predictions on the test set
        self._test_preds_prob = None       # class probabilities on the test set (clf only)

    def get_features(self):
        # Bug fix: previously returned self._feature_set, an attribute that
        # is never assigned (the constructor stores self._features), so
        # every call raised AttributeError.
        return self._features

    def get_targets(self):
        return self._targets

    def get_train_and_test_sets(self):
        return self._train_and_test_sets

    def get_trained_model(self):
        return self._trained_model

    def get_optimal_hyperparameters(self):
        return self._best_hps

    def split_dataset(self, train_size=0.8, test_size=0.2):
        """
        Split dataset into a training and test set.

        Note: both sizes are fractions of the full dataset and need not sum
        to 1 (the star dataset deliberately uses only a ~2% subset).
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self._features,
            self._targets,
            train_size=train_size,
            test_size=test_size,
        )

        self._train_and_test_sets = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }

    def train_model(self, model_type, cv_folds=10):
        """
        Train either a classification ("clf" -> SVC) or regression
        ("reg" -> SVR) model, optimising hyperparameters (C, gamma) via
        cross-validated grid search, then store the fitted model and its
        predictions on both splits.

        :param model_type: "clf" or "reg".
        :param cv_folds: number of cross-validation folds.
        :raises ValueError: if model_type is not "clf" or "reg".
        """
        # Identity check (`is not None`) instead of the previous `!= None`.
        assert self._train_and_test_sets is not None, "You don't have your training and test sets."

        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")
        X_test = self._train_and_test_sets.get("X_test")
        y_test = self._train_and_test_sets.get("y_test")

        # Initialise the estimator. probability=True enables predict_proba,
        # which the precision-recall / ROC plots rely on.
        if model_type == "clf":
            estimator = SVC(probability=True)
            print("Classifier model initialised...")
        elif model_type == "reg":
            estimator = SVR()
            print("Regression model initialised...")
        else:
            # Bug fix: an unknown model_type previously fell through and
            # produced a confusing NameError on `estimator` further down.
            raise ValueError(f"Unknown model_type {model_type!r}; expected 'clf' or 'reg'.")

        # Hyperparameter ranges to be searched.
        parameter_grid = [{
            "C": [1, 10, 100],
            "gamma": [0.01, 0.1, 1],
        }]

        # refit=True re-trains on the full training set with the best
        # hyperparameter combination found by the search.
        model = GridSearchCV(
            estimator=estimator,
            param_grid=parameter_grid,
            cv=cv_folds,
            refit=True
        )

        print("Fitting model and performing cross-validation...")
        model.fit(X_train, y_train)
        print("Model fitting and cross-validation complete...")

        # Getting predictions
        print("Making predictions...")
        self._train_preds = model.predict(X_train)
        self._test_preds = model.predict(X_test)

        if model_type == "clf":
            self._train_preds_prob = model.predict_proba(X_train)
            self._test_preds_prob = model.predict_proba(X_test)

        # Assigning class variables
        self._trained_model = model
        self._best_hps = model.best_params_

    def _get_key_metric_plot_clf(self, y_train, y_test, train_preds, test_preds):
        """
        Returns a bar plot that visualises key
        scores for a classification model.

        Uses weighted averaging so multi-class scores account for class
        support.
        """
        train_acc = accuracy_score(y_train, train_preds)
        train_prec = precision_score(y_train, train_preds, average="weighted")
        train_rec = recall_score(y_train, train_preds, average="weighted")
        train_f1 = f1_score(y_train, train_preds, average="weighted")

        test_acc = accuracy_score(y_test, test_preds)
        test_prec = precision_score(y_test, test_preds, average="weighted")
        test_rec = recall_score(y_test, test_preds, average="weighted")
        test_f1 = f1_score(y_test, test_preds, average="weighted")

        # One bar group per dataset, one bar per metric.
        x = ["Training Set", "Test Set"]
        key_metric_plot = plot_bar_data(
            ("Accuracy", [train_acc, test_acc]),
            ("Precision", [train_prec, test_prec]),
            ("Recall", [train_rec, test_rec]),
            ("F1 Score", [train_f1, test_f1]),
            x=x,
            title="Key Metrics",
            x_label="Dataset"
        )

        return key_metric_plot

    def _get_key_metric_plot_reg(self, y_train, y_test, train_preds, test_preds):
        """
        Returns a bar plot that visualises key
        metrics for a regression model (MSE, MAE, R2).
        """
        train_mse = mean_squared_error(y_train, train_preds)
        train_mae = mean_absolute_error(y_train, train_preds)
        train_r2 = r2_score(y_train, train_preds)

        test_mse = mean_squared_error(y_test, test_preds)
        test_mae = mean_absolute_error(y_test, test_preds)
        test_r2 = r2_score(y_test, test_preds)

        # One bar group per dataset, one bar per metric.
        x = ["Training Set", "Test Set"]
        key_metric_plot = plot_bar_data(
            ("Mean Squared Error", [train_mse, test_mse]),
            ("Mean Absolute Error", [train_mae, test_mae]),
            ("R2 Score", [train_r2, test_r2]),
            x=x,
            title="Key Metrics",
            x_label="Dataset"
        )

        return key_metric_plot

    def _get_true_pred_plot(self, y_test, test_preds):
        """
        Creates a scatter plot that shows predicted values
        against true values, indexed by test-sample position.
        """
        x = np.arange(y_test.shape[0])
        y_test_trace = create_trace(x=x, y=y_test, name="True Values", mode="markers")
        y_pred_trace = create_trace(x=x, y=test_preds, name="Predicted Values", mode="markers")
        true_pred_plot = plot_data(y_test_trace, y_pred_trace, title="True vs Predicted Values")

        return true_pred_plot

    def _get_precision_recall_plot(self, y_test, test_preds_prob, encodings, classes=(0, 1, 2)):
        """
        Creates one-vs-rest precision-recall curves (with AUC) for all
        classes in a classification task.

        :param encodings: mapping from class index to human-readable label.
        :param classes: the class labels present in y_test.
        """
        n_classes = len(classes)

        # Binarise true test values for one-vs-rest curves.
        y_test_bin = label_binarize(y_test, classes=classes)

        prec = dict()
        rec = dict()
        traces = []

        for i in range(n_classes):
            prec[i], rec[i], _ = precision_recall_curve(y_test_bin[:, i], test_preds_prob[:, i])
            curve_area = auc(rec[i], prec[i])
            new_trace = create_trace(
                x=rec[i],
                y=prec[i],
                name=f"P-R for Class {i} ({encodings[i]}); AUC = {round(curve_area, 3)}"
            )
            traces.append(new_trace)

        return plot_data(
            *traces,
            x=np.arange(0,1, 0.01),
            title="Precision-Recall Curves",
            x_label="Recall",
            y_label="Precision",
        )

    def _get_roc_plot(self, y_test, test_preds_prob, encodings, classes=(0, 1, 2)):
        """
        Creates one-vs-rest ROC curves (with AUC) for all
        classes in a classification task.

        :param encodings: mapping from class index to human-readable label.
        :param classes: the class labels present in y_test.
        """
        n_classes = len(classes)

        # Binarise true test set values for one-vs-rest curves.
        y_test_bin = label_binarize(y_test, classes=classes)

        fpr = dict()
        tpr = dict()
        traces = []

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], test_preds_prob[:, i])
            curve_area = auc(fpr[i], tpr[i])
            new_trace = create_trace(
                x=fpr[i],
                y=tpr[i],
                name=f"ROC for Class {i} ({encodings[i]}); AUC = {round(curve_area, 3)}"
            )
            traces.append(new_trace)

        return plot_data(
            *traces,
            x=np.arange(0,1, 0.01),
            title="ROC Curves",
            x_label="False-Positive Rate",
            y_label="True-Positive Rate",
        )

    def visualise_results_clf(self, encodings):
        """
        Creates a series of plots to visualise performance
        results for a classification model: key metrics, true vs
        predicted values, precision-recall curves and ROC curves.
        """
        assert self._trained_model is not None, "You haven't trained a model yet."

        # Getting training, test and predictions data.
        # (Removed an unused `model` local that was never referenced.)
        y_train = self._train_and_test_sets.get("y_train")
        y_test = self._train_and_test_sets.get("y_test")
        train_preds = self._train_preds
        test_preds = self._test_preds
        test_preds_prob = self._test_preds_prob

        key_metric_plot = self._get_key_metric_plot_clf(y_train, y_test, train_preds, test_preds)
        true_pred_plot = self._get_true_pred_plot(y_test, test_preds)
        prec_rec_plot = self._get_precision_recall_plot(y_test, test_preds_prob, encodings)
        roc_plot = self._get_roc_plot(y_test, test_preds_prob, encodings)

        # Combining plots: keys are (row, col, x_label, y_label).
        plots = {
            (1,1,"Dataset",""): key_metric_plot,
            (2,1,"","Class"): true_pred_plot,
            (3,1,"Recall","Precision"): prec_rec_plot,
            (3,2,"False-Positive Rate","True-Positive Rate"): roc_plot
        }

        subplot_titles = [
            "Key Metrics",
            "True vs Predicted Values",
            "Precision-Recall Curves",
            "ROC Curves",
        ]

        # First two rows span both columns; last row holds two plots.
        specs = [
            [{"type": "bar", "colspan": 2}, None],
            [{"type": "xy", "colspan": 2}, None],
            [{"type": "xy"}, {"type": "xy"}],
        ]

        combined_plot = plot_collection(
            plots,
            rows=3,
            cols=2,
            subplot_titles=subplot_titles,
            specs=specs,
            title="Model Performance Results",
        )

        return combined_plot

    def visualise_results_reg(self):
        """
        Creates a series of plots to visualise performance
        results for a regression model: key metrics and true vs
        predicted values.
        """
        assert self._trained_model is not None, "You haven't trained a model yet."

        # Getting training, test and predictions data.
        # (Removed an unused `model` local that was never referenced.)
        y_train = self._train_and_test_sets.get("y_train")
        y_test = self._train_and_test_sets.get("y_test")
        train_preds = self._train_preds
        test_preds = self._test_preds

        key_metric_plot = self._get_key_metric_plot_reg(y_train, y_test, train_preds, test_preds)
        true_pred_plot = self._get_true_pred_plot(y_test, test_preds)

        # Combining plots: keys are (row, col, x_label, y_label).
        plots = {
            (1,1,"Dataset",""): key_metric_plot,
            (2,1,"","Productivity"): true_pred_plot,
        }

        subplot_titles = ["Key Metrics","True vs Predicted Values"]

        specs = [
            [{"type": "bar"}],
            [{"type": "xy"}],
        ]

        combined_plot = plot_collection(
            plots,
            rows=2,
            cols=1,
            subplot_titles=subplot_titles,
            specs=specs,
            title="Model Performance Results",
            height=900
        )

        return combined_plot

Loading Datasets¶

In [4]:
# Productivity dataset; using optimal configuration as determined in Task3-1
gwp_dsm = DatasetManager("gwp_assessment")
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative")  # presumably column indices + imputer name — see task3_1
gwp_dsm.create_feature_set(7)  # presumably the number of features to keep — see task3_1
gwp_dsm.scale_feature_set()

# Star dataset; using optimal configuration as determined in Task3-1
star_dsm = DatasetManager("star_assessment")
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn")  # presumably column indices + imputer name — see task3_1
star_dsm.create_feature_set(8)  # presumably the number of features to keep — see task3_1
star_dsm.scale_feature_set()
Dataset loaded...
Dataset cleaned..
Dataset encodings..
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned..
Dataset encodings..
Dataset numerised...
Missing values imputed...

Getting targets and features¶

In [5]:
# Productivity dataset
gwp_features = gwp_dsm.get_scaled_feat_ds()
gwp_targets = gwp_dsm.get_complete_ds()[:, -1]  # last column holds the target

# Star dataset
star_features = star_dsm.get_scaled_feat_ds()
star_targets = star_dsm.get_complete_ds()[:, -1]  # last column holds the target

Initialising Model Managers¶

In [6]:
# One ModelManager per dataset; each owns its own splits, model and predictions.
# GWP dataset (regression)
gwp_mm = ModelManager(gwp_features, gwp_targets)

# Star dataset (classification)
star_mm = ModelManager(star_features, star_targets)

Model Evaluation¶

Methodology

  1. Datasets will be split into training and test sets.
  2. Models will be trained on training sets; cross validation will be used to optimise hyperparameters.
  3. Model performance will be evaluated using selected evaluation metrics; the results will then be visualised to paint full picture of a model's performance.
  4. Steps 1-3 will be repeated for several training-test splits (80-20, 75-25, 70-30, 60-40, 50-50) to assess the effect of split ratio on model performance.

Evaluation metrics

  • Productivity dataset: accuracy, precision, recall, F1 score. These metrics are ideal metrics for evaluating classification models as they provide comprehensive insight into a model's performance. Accuracy helps understand the overall effectiveness of the model. However, it can be misleading in imbalanced datasets, which is where precision and recall come in. They provide a more nuanced view of the model's ability to correctly identify positive instances and avoid false positives. The F1 score harmonises precision and recall, offering a single metric that seeks a balance between these two characteristics, making it especially useful when the costs of false positives and false negatives are significantly different.

  • Star dataset: mean squared error (MSE), mean absolute error (MAE), R2 score. These are robust metrics for evaluating regression models, with each illuminating different aspects of model performance. MSE emphasizes larger errors by squaring residuals, making it useful when larger errors are undesirable. MAE provides a more straightforward measure of average error magnitude, regardless of direction. The R2 score complements these by providing a relative measure of how much variance the model can explain, giving a broader picture of model performance beyond just raw error. These combined provide a comprehensive assessment of the model's effectiveness.

Notes

  • Due to the size of the star dataset (as well as the limitations of the machine on which this program was developed) only a small subset of the dataset (approximately 2%) will be used to train models.

80-20 Split¶

Splitting Datasets into Train and Test Sets¶

In [7]:
# Splitting productivity dataset (80% train, 20% test)
gwp_mm.split_dataset(train_size=0.8, test_size=0.2)

# Splitting star dataset — fractions are of the FULL dataset, so only ~2%
# is used in total (an 80-20 ratio within that subset); see the note above.
star_mm.split_dataset(train_size=0.016, test_size=0.004)

Model Training¶

In [8]:
# Productivity dataset: SVR with 10-fold cross-validated grid search
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [9]:
# Star dataset: SVC with 10-fold cross-validated grid search
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [10]:
# Productivity dataset: regression metrics + true-vs-predicted plot
gwp_mm.visualise_results_reg()
In [11]:
# Star dataset: fetch label encodings for column 17 (presumably the target
# column — see task3_1) so curves are annotated with readable class names
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring roughly 10 percentage points better on the training set across all metrics.
  • The scores indicate that the model has neither over- nor under-fitted on the training data and can generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

75-25 Split¶

Splitting Datasets into Train and Test Sets¶

In [12]:
# Splitting productivity dataset (75% train, 25% test)
gwp_mm.split_dataset(train_size=0.75, test_size=0.25)

# Splitting star dataset — a 75-25 ratio within the ~2% subset of the full data
star_mm.split_dataset(train_size=0.015, test_size=0.005)

Model Training¶

In [13]:
# Productivity dataset: SVR with 10-fold cross-validated grid search
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [14]:
# Star dataset: SVC with 10-fold cross-validated grid search
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [15]:
# Productivity dataset: regression metrics + true-vs-predicted plot
gwp_mm.visualise_results_reg()
In [16]:
# Star dataset: label encodings for column 17 annotate the curve legends
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring only slightly better on the training set across all metrics.
  • The scores indicate that the model has neither over- nor under-fitted on the training data and can generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

70-30 Split¶

Splitting Datasets into Train and Test Sets¶

In [17]:
# Splitting productivity dataset (70% train, 30% test)
gwp_mm.split_dataset(train_size=0.7, test_size=0.3)

# Splitting star dataset — a 70-30 ratio within the ~2% subset of the full data
star_mm.split_dataset(train_size=0.014, test_size=0.006)

Model Training¶

In [18]:
# Productivity dataset: SVR with 10-fold cross-validated grid search
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [19]:
# Star dataset: SVC with 10-fold cross-validated grid search
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [20]:
# Productivity dataset: regression metrics + true-vs-predicted plot
gwp_mm.visualise_results_reg()
In [21]:
# Star dataset: label encodings for column 17 annotate the curve legends
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the low R2 scores (for both the training and test sets) suggest that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring roughly 10-12 percentage points better on the training set across all metrics.
  • The scores indicate that the model has neither over- nor under-fitted on the training data and can generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 0 being the strongest.

60-40 Split¶

Splitting Datasets into Train and Test Sets¶

In [22]:
# Splitting productivity dataset (60% train, 40% test)
gwp_mm.split_dataset(train_size=0.6, test_size=0.4)

# Splitting star dataset — a 60-40 ratio within the ~2% subset of the full data
star_mm.split_dataset(train_size=0.012, test_size=0.008)

Model Training¶

In [23]:
# Productivity dataset: SVR with 10-fold cross-validated grid search
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [24]:
# Star dataset: SVC with 10-fold cross-validated grid search
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [25]:
# Productivity dataset: regression metrics + true-vs-predicted plot
gwp_mm.visualise_results_reg()
In [26]:
# Star dataset: label encodings for column 17 annotate the curve legends
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the low R2 scores suggest that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring roughly 10-12 percentage points better on the training set across all metrics.
  • The scores indicate that the model has neither over- nor under-fitted on the training data and can generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

50-50 Split¶

Splitting Datasets into Train and Test Sets¶

In [27]:
# Splitting productivity dataset (50% train, 50% test)
gwp_mm.split_dataset(train_size=0.5, test_size=0.5)

# Splitting star dataset — a 50-50 ratio within the ~2% subset of the full data
star_mm.split_dataset(train_size=0.01, test_size=0.01)

Model Training¶

In [28]:
# Productivity dataset: SVR with 10-fold cross-validated grid search
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [29]:
# Star dataset: SVC with 10-fold cross-validated grid search
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [30]:
# Productivity dataset: regression metrics + true-vs-predicted plot
gwp_mm.visualise_results_reg()
In [31]:
# Star dataset: label encodings for column 17 annotate the curve legends
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the low R2 scores suggest that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring roughly 15 percentage points better on the training set across all metrics.
  • The scores indicate that the model may have overfitted slightly on the training data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is much better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

Analysis of split ratios¶

Productivity dataset

  • The overall accuracy of the model is relatively unaffected by the changing of split ratio. The MSE and MAE scores are fairly consistent across all split ratios.
  • The R2 scores vary arbitrarily with the choice of split ratio. A robust relationship/correlation cannot be determined; this may require further investigation.
  • The 70-30 split appears to be optimal.

Star dataset

  • Whilst model performance (across all metrics) is adequate for all split ratios, the gap in performance on the training set and test set begins to widen as the split ratio approaches 50-50; this suggests that as the model is fed less and less training data, it starts to underfit.
  • The true-positive rate also decreases slightly as the split ratio approaches 50-50.
  • Conversely, the average precision (for all 3 classes) picks up slightly as the ratio approaches 50-50. This would suggest that split ratios closer to 50-50 have a better distribution of all the classes in the dataset.
  • The 75-25 split appears to be optimal.

Markdown Answer¶

The targets (i.e. the values we're trying to predict) for the star dataset all belong to one of a discrete set of classes. Linear regression isn't ideal for predicting discrete classes; rather it is designed for continuous outputs. It calculates the best-fitting line through a set of data and predicts values (which can be any number in the set of real numbers) along that line. When working with discrete classes, these predictions often don't make sense. E.g., if we're predicting classes labeled 0, 1, and 2, linear regression might give us 1.2785, which doesn't correspond to any class. Furthermore, linear regression is sensitive to outliers, which can significantly skew the best-fitting line and hence, the predictions. In classification problems, this can lead to many misclassifications.

In [ ]: